Skunkware 5

home *** CD-ROM | disk | FTP | other *** search

/ Skunkware 5 / Skunkware 5.iso / src / Tools / libwais / ir / irbuild.c < prev next >

Wrap

C/C++ Source or Header | 1995-05-03 | 33.7 KB | 950 lines

/* WIDE AREA INFORMATION SERVER SOFTWARE: No guarantees or restrictions. See the readme file for the full standard disclaimer. Brewster@think.com */ #ifndef lint static char *RCSid = "$Header: /y/src/wais/wais-8-b5/ir/RCS/irbuild.c,v 1.47.1.1 1992/07/11 01:01:21 curtisg Exp curtisg $"; #endif /* * Building an index with a Unix shell interface. * * -brewster 6/90 */ /* Change log: * added -stdio option from jik@athena.mit.edu * $Log: irbuild.c,v $ * * M000, 28-May-93, hess * mods for ERG database files. * same as mail type, but bails out after ~200 lines, avoids large, large * indexes..., since many times the "From " portion of the file is indented * our mail threads look like one large piece of mail, which is ok, they * are intended to be read as a whole. But we still need to make sure * they are'nt broken up, so never let the seperator routine return true * * Revision 1.47.1.1 1992/07/11 01:01:21 curtisg * Changes for SCO UNIX * * Revision 1.47 92/05/10 14:48:17 jonathan * Updated for release. * * Revision 1.46 92/05/08 10:03:17 jonathan * Adjusted memory paramters. It's closer... * * Revision 1.45 92/05/06 17:26:46 jonathan * Added switch for indexing contents, new user-specified type name, new type: * filename, which only puts the name of the file in the header. * * Revision 1.44 92/04/25 21:14:35 brewster * added ziff * * Revision 1.43 92/04/22 15:29:13 jonathan * Added jargon to usage message. * * Revision 1.42 92/04/01 17:08:50 jonathan * Added FTP type. * * Revision 1.41 92/03/25 18:49:39 jonathan * Added log_level and log_file arguments. * * Revision 1.40 92/03/22 18:38:14 brewster * added objective C filter * * Revision 1.39 92/03/20 11:02:44 jonathan * Added code to handle switches for word_pairs and word_postition info. * * Revision 1.38 92/03/17 07:34:32 jonathan * Fixed spacing in usage message. * * Revision 1.37 92/03/10 10:42:51 morris * fixed small bug in command line argument handleing. doesn't die if there * are no args. * * Revision 1.36 92/03/05 07:05:32 shen * add cm grow percent and textsize to command line and init search engine * * Revision 1.35 92/03/04 16:34:09 jonathan * Set wais_pid from getpid(). * * Revision 1.34 92/02/20 09:49:37 jonathan * Added bibtex and nhyp filters from S.P.vandeBurgt@research.ptt.nl. * * Revision 1.33 92/02/17 14:21:08 jonathan * Added switch to disable creation of catalog (-nocat). * * Revision 1.32 92/02/17 12:41:55 jonathan * Added RCSid. * * Revision 1.31 92/02/17 12:41:01 jonathan * Build catalog after completion of indexing. * * Revision 1.30 92/02/12 13:22:53 jonathan * Added "$Log" so RCS will put the log message in the header * */ /* to do: * done: make incremental indexing not index things that are already index * add extra arg -register that will send in description of the server to * the directory of servers. * done: create a source struct in the .src file * make it continuously index to keep itself uptodate. * */ #include <string.h> #include <sys/types.h> #include <sys/param.h> #include <sys/stat.h> #include "irdirent.h" #include "cutil.h" #include "futil.h" #include "irfiles.h" #include "irtfiles.h" #include "panic.h" #include "ircfiles.h" #include "version.h" #include "irext.h" #define INDEXER_DATE "Sun May 10 1992" /* for reporting errors, in WAIStation it is defined in CRetrievalApp.c */ extern boolean indexingForBeta; struct file_type { char *name; char *description; char *type; boolean (*separator_function)(); void (*header_function)(); long (*date_function)(); void (*finish_header_function)(); boolean index_contents; } file_type_list[] = { {"groliers", "groliers encyclopedia special format", "TEXT", groliers_separator_function, groliers_header_function, 0, groliers_finish_header_function, 1}, #ifdef NEXT {"objc", "objective-C .h and .m files", "TEXT", qobjc_separator_function, wobj_header_function, 0, wobj_finish_header_function, 1}, #endif /* NEXT */ {"mail", "mail format", "TEXT", mail_or_rmail_separator, mail_header_function, mail_date_function, mail_finish_header_function, 1}, {"mail_or_rmail", "mail or rmail or both", "TEXT", mail_or_rmail_separator, mail_header_function, mail_date_function, mail_finish_header_function, 1}, {"mail_digest", "standard internet mail digest format", "TEXT", mail_digest_separator_function, mail_header_function, mail_date_function, mail_finish_header_function, 1}, {"mh_bboard", "MH bulletin board format", "TEXT", mh_bboard_separator_function, mail_header_function, 0, mail_finish_header_function, 1}, {"rmail", "rmail format", "TEXT", rmail_separator_function, mail_header_function, mail_date_function, mail_finish_header_function, 1}, {"netnews", "netnews format", "TEXT", 0, mail_header_function, mail_date_function, mail_finish_header_function, 1}, {"rn", "netnews saved by the [rt]?rn newsreader", "TEXT", rn_separator_function, mail_header_function, mail_date_function, mail_finish_header_function, 1}, {"emacsinfo", "the GNU documentation system", "TEXT", emacs_info_separator_function, emacs_info_header_function, 0, emacs_info_finish_header_function, 1}, {"catalog", "??", "TEXT", catalog_separator_function, catalog_header_function, 0, catalog_finish_header_function, 1}, {"bio", "biology abstract format", "TEXT", bio_separator_function, bio_header_function, 0, bio_finish_header_function, 1}, {"cmapp", "CM applications from Hypercard", "TEXT", cmapp_separator_function, cmapp_header_function, 0, cmapp_finish_header_function, 1}, {"ftp", "special type for FTP files. First line of file is headline", "TEXT", first_line_separator_function, first_line_header_function, 0, first_line_finish_header_function, 1}, {"jargon", "Jargon File 2.9.8 format", "TEXT", jargon_separator_function, jargon_header_function, 0, jargon_finish_header_function, 1}, {"server", "server structures for the dir of servers", "WSRC", 0, 0, 0, filename_finish_header_function}, {"text", "simple text files, this is the default", "TEXT", 0,0,0,0, 1}, {"filename", "uses only the filename part of the pathname for the title", "TEXT", 0,0,0, filename_finish_header_function, 1}, {"irg", "internet resource guide", "TEXT", irg_separator_function, irg_header_function, 0, irg_finish_header_function, 1}, {"dash", "entries separated by a row of dashes", "TEXT", dash_separator_function, dash_header_function, 0, dash_finish_header_function, 1}, {"one_line", "each line is a document", "TEXT", one_line_separator_function, one_line_header_function, 0, one_line_finish_header_function, 1}, {"para", "paragraphs separated by blank lines", "TEXT", para_separator_function, para_header_function, 0, para_finish_header_function, 1}, {"seeker", "??", "TEXT", seeker_separator_function, seeker_header_function, 0, seeker_finish_header_function, 1}, {"medline", "medline format", "TEXT", medline_separator_function, medline_header_function, 0, medline_finish_header_function, 1}, {"refer", "refer format", "TEXT", refer_separator_function, refer_header_function, 0, refer_finish_header_function, 1}, {"first_line", "first line of file is headline", "TEXT", first_line_separator_function, first_line_header_function, 0, first_line_finish_header_function, 1}, {"rlin", "??", "TEXT", rlin_separator_function, rlin_header_function, 0, rlin_finish_header_function, 1}, {"dvi", "dvi format", "DVI", 0, 0, 0, filename_finish_header_function, 1}, {"ps", "postscript format ", "PS", 0, 0, 0, filename_finish_header_function, 0}, {"pict", "pict files, only indexes the filename", "PICT", 0, 0, 0, filename_finish_header_function, 0}, {"gif", "gif files, only indexes the filename", "GIF", 0, 0, 0, filename_finish_header_function, 0}, {"tiff", "tiff files, only indexes the filename", "TIFF", 0, 0, 0, filename_finish_header_function, 0}, {"bibtex", "BibTeX / LaTeX format", "TEXT", bibtex_separator_function, bibtex_header_function, 0, bibtex_finish_header_function, 1}, {"nhyp", "?:? hyper text format, Polytechnic of Central London", "TEXT", nhyp_separator_function, nhyp_header_function, 0, nhyp_finish_header_function, 1}, {"ziff", "ziff special format", "TEXT", ziff_separator_function, ziff_header_function, 0, ziff_finish_header_function, 1}, #ifdef /* sco */ M_UNIX {"erg_mail_thread", "SCO ERG mail thread format", "TEXT", erg_thread_separator_function, erg_thread_header_function, mail_date_function, erg_thread_finish_header_function, 1}, {"mmdf", "MMDF mail folder format", "TEXT", mmdf_separator_function, mail_header_function, mail_date_function, mail_finish_header_function, 1}, { "change_desc", "SCO ERG change descriptions format", "TEXT", first_line_separator_function, erg_cd_header_function, mail_date_function, /* fix this later, can look in the cd */ erg_cd_finish_header_function, 1}, #endif 0 }; void usage(command) char *command; { /* no args */ struct file_type *t; int first; fprintf(stderr,"Usage: %s [-d index_filename]\n", command); fprintf(stderr," [-a] /* adding to an existing index, otherwise it erases the index */\n"); fprintf(stderr," [-r] /* recursively index subdirectories */\n"); fprintf(stderr," [-mem mbytes] /* number of megabytes to run this in */\n"); fprintf(stderr," [-register] /* registers the database with the directory of servers.\n"); fprintf(stderr," This should be done with care. */\n"); fprintf(stderr," [-export] /* uses short dbname and port 210 */\n"); fprintf(stderr," [-e [file]] /* set log output to file, or /dev/null if not specified */\n"); fprintf(stderr," [-f [filter]] /* run filter on each file before indexing */\n"); fprintf(stderr," [-l log_level] /* set log level. 0 means log nothing,\n"); fprintf(stderr," 10 [the default] means log everything */\n"); fprintf(stderr," [-v] /* print the version of the software */\n"); fprintf(stderr," [-stdin] /* read file names from stdin */\n"); fprintf(stderr," [-pos | -nopos] /* include (don't include - default) word position information /*\n"); fprintf(stderr," [-nopairs | -pairs] /* don't include (or include - default) word pairs /*\n"); fprintf(stderr," [-nocat] /* inhibit creation of catalog /*\n"); fprintf(stderr," [-contents] /* Index the contents: this is good for types that\n"); fprintf(stderr," inhibit the indexing of the contents (like gif). /*\n"); fprintf(stderr," [-nocontents] /* Index only the filename, not the contents /*\n"); fprintf(stderr," [-cmmem mem%] /* percent of CM memory (CM code only) */\n"); fprintf(stderr," [-T type] /* type becomes the \"TYPE\" of the document. */\n"); fprintf(stderr," [-t /* format of the file. if none then each file is a document */\n"); for (t=file_type_list, first=0; t->name; t++, first++) { fprintf(stderr," %c %s /* %s */\n", first ? '|' : ' ', t->name, t->description); } #if 0 fprintf(stderr," text /* simple text files, this is the default */\n"); fprintf(stderr," | bibtex /* BibTeX / LaTeX format */\n"); fprintf(stderr," | bio /* biology abstract format */\n"); fprintf(stderr," | cmapp /* CM applications from Hypercard */\n"); fprintf(stderr," | dash /* entries separated by a row of dashes */\n"); fprintf(stderr," | dvi /* dvi format */\n"); fprintf(stderr," | emacsinfo /* the GNU documentation system */\n"); fprintf(stderr," | first_line /* first line of file is headline */\n"); fprintf(stderr," | filename /* uses only the filename part of the pathname for the title */\n"); fprintf(stderr," | ftp /* special type for FTP files. First line of file is headline */\n"); fprintf(stderr," | gif /* gif files, only indexes the filename */\n"); fprintf(stderr," | irg /* internet resource guide */\n"); fprintf(stderr," | jargon /* Jargon File 2.9.8 format*/\n"); fprintf(stderr," | mail_digest /* standard internet mail digest format */\n"); fprintf(stderr," | mail_or_rmail /* mail or rmail or both */\n"); fprintf(stderr," | medline /* medline format */\n"); fprintf(stderr," | mh_bboard /* MH bulletin board format */\n"); fprintf(stderr," | netnews /* netnews format */\n"); fprintf(stderr," | nhyp /* ?:? hyper text format, Polytechnic of Central London */\n"); fprintf(stderr," | one_line /* each line is a document */\n"); fprintf(stderr," | para /* paragraphs separated by blank lines */\n"); fprintf(stderr," | pict /* pict files, only indexes the filename */\n"); fprintf(stderr," | ps /* postscript format */\n"); fprintf(stderr," | refer /* refer format */\n"); fprintf(stderr," | rn /* netnews saved by the [rt]?rn newsreader */\n"); fprintf(stderr," | server /* server structures for the dir of servers */\n"); #ifdef NeXT fprintf(stderr," | objc /* objective-C .h and .m files */\n"); #endif /* def NeXT */ fprintf(stderr," | tiff /* tiff files, only indexes the filename */\n"); #endif /* 0 */ fprintf(stderr," ] filename filename ...\n"); } char *log_file_name = NULL; FILE *logfile; extern boolean index_contents; extern boolean filter_contents; extern char filter_program[]; #define set(a,b) if (b) (a)=(b) /* This is the MAIN for building an index. */ void main(argc, argv) int argc; char *argv[]; { database* db = NULL; long argc_copy = argc; char **argv_copy = argv; char *next_argument; char index_filename[1000]; boolean (*separator_function)(); void (*header_function)(); void (*finish_header_function)(); long (*date_function)(); boolean adding_to_existing_index = false; boolean traverse_directory = false; boolean word_positions = false; boolean word_pairs = true; long memory_to_use = -1; long cm_mem_percent = 0; /* default */ long grow_percent = 0; /* default */ long text_size = 0; /* default */ boolean check_for_text_file = false; boolean register_database = false; boolean export_database = false; boolean read_files_from_stdin = false; boolean make_catalog = true; char data_filename[MAX_PATH_NAME_LEN]; char *typename = NULL; /* this is what the user said */ char type[256]; /* char *type = NULL; /* this is the type stored with the db */ long start_of_filenames; long hashtable_size = 1L<<16; long flush_after_n_words = 300000; char *command_name; struct file_type *t; next_argument = next_arg(&argc, &argv); separator_function = NULL; /* initailize to nil */ header_function = NULL; date_function = NULL; finish_header_function = NULL; /* type = "TEXT"; /* default to text */ strcpy(type, "TEXT"); typename = "Text"; command_name = next_argument; logfile = stderr; wais_pid = getpid(); if(0 == argc) { usage(command_name); exit(0); } #ifdef THINK_C strcpy(index_filename, "wais:System Folder:wais-index:index"); #else strcpy(index_filename, "index"); /* in the current directory */ #endif /* THINK_C */ if(NULL == (next_argument = next_arg(&argc, &argv))){ fprintf(stderr,"No arguments specified\n"); exit(0); } while((next_argument != NULL) && '-' == next_argument[0]){ /* then we have an argument to process */ if((0 == strcmp("-i", next_argument)) || /* -i is for backcompatibility */ (0 == strcmp("-d", next_argument))){ if(NULL == (next_argument = next_arg(&argc, &argv))){ fprintf(stderr,"Expected filename for the index\n"); exit(0); } strcpy(index_filename, next_argument); } else if(0 == strcmp("-a", next_argument)){ adding_to_existing_index = true; } else if(0 == strcmp("-r", next_argument)){ traverse_directory = true; } else if(0 == strcmp("-register", next_argument)){ register_database = true; } else if(0 == strcmp("-export", next_argument)){ export_database = true; } else if(0 == strcmp("-f", next_argument)){ if(NULL == (next_argument = next_arg(&argc, &argv))){ fprintf(stderr,"Expected filter for -f\n"); exit(1); } strcpy(filter_program, next_argument); filter_contents = TRUE; } else if(0 == strcmp("-v", next_argument)){ fprintf(stderr,"%s: %s\n", command_name, VERSION, INDEXER_DATE); } else if (0 == strcmp("-stdin", next_argument)) { read_files_from_stdin = true; } else if (0 == strcmp("-nopos", next_argument)) { word_positions = false; } else if (0 == strcmp("-pos", next_argument)) { word_positions = true; } else if (0 == strcmp("-nopairs", next_argument)) { word_pairs = false; } else if (0 == strcmp("-pairs", next_argument)) { word_pairs = true; } else if (0 == strcmp("-nocat", next_argument)) { make_catalog = false; } else if(0 == strcmp("-mem", next_argument)){ if(NULL == (next_argument = next_arg(&argc, &argv))) panic("Expected a number for the amount of memory to use"); memory_to_use = atol(next_argument); if(memory_to_use < 1) panic("The -mem argument should not be less than 1"); if(memory_to_use > 200) fprintf(stderr,"Warning: The -mem parameter was %ld Mbytes. That is a large number of mega bytes in current machines\n", memory_to_use); } else if(0 == strcmp("-cmmem", next_argument)){ if(NULL == (next_argument = next_arg(&argc, &argv))) panic("Expected a number (1-100) for percentage of memory to use"); cm_mem_percent = atol(next_argument); if(cm_mem_percent < 1) panic("The -cmmem argument should not be less than 1 and less than 100"); if(cm_mem_percent > 100) panic("Warning: The -cmmem parameter was %ld%%. It should be between 1-100.", cm_mem_percent); } else if(0 == strcmp("-grow", next_argument)){ if(NULL == (next_argument = next_arg(&argc, &argv))) panic("Expected a number (1-100) for database growing percentage"); grow_percent = atol(next_argument); if(grow_percent < 1) panic("The -grow argument should not be less than 1"); } else if(0 == strcmp("-textsize", next_argument)){ if(NULL == (next_argument = next_arg(&argc, &argv))) panic("Expected a number for text size in megabytes"); text_size = atol(next_argument); if(text_size < 1) panic("The -textsize argument should not be less than 1"); } else if (0 == strcmp("-e", next_argument)) { char *peek_argument = peek_arg(&argc, &argv); log_file_name = "/dev/null"; /* default to /dev/null */ if ((peek_argument != NULL) && ('-' != peek_argument[0])) { log_file_name = next_arg(&argc, &argv); } /* end if (explicit log file) */ } /* end if (-e) */ else if (0 == strcmp("-l", next_argument)) { wais_log_level = atol(next_arg(&argc, &argv)); } /* end if (-l) */ else if(0 == strcmp("-cm", next_argument)){ /* this is an undocumented argument to help use this to front end the CM application */ indexingForBeta = true; } else if(0 == strcmp("-T", next_argument)){ /* This is a specification for a "Special" type. The next argument is the type name. This will not index the body of the file. */ if(NULL == (next_argument = next_arg(&argc, &argv))) panic("Expected a file type"); typename = next_argument; /* type = next_argument;*/ strcpy(type, next_argument); finish_header_function = filename_finish_header_function; } else if(0 == strcmp("-contents", next_argument)){ index_contents = true; } else if(0 == strcmp("-nocontents", next_argument)){ index_contents = false; } else if(0 == strcmp("-t", next_argument)){ /* then we have a specialized file */ index_contents = true; if(NULL == (next_argument = next_arg(&argc, &argv))) panic("Expected a file type"); for(t = file_type_list; t->name; t++) { if (strcmp(t->name, next_argument) == 0) { typename = t->name; strcpy(type, t->type); set(separator_function, t->separator_function); set(header_function, t->header_function); set(date_function, t->date_function); set(finish_header_function, t->finish_header_function); set(index_contents, t->index_contents); goto found; } } panic("Don't recognize the '%s' type", next_argument); found: ; #if 0 if(0 == strcmp("groliers", next_argument)){ typename = next_argument; type = "TEXT"; separator_function = groliers_separator_function; header_function = groliers_header_function; finish_header_function = groliers_finish_header_function; } #ifdef NeXT else if(0 == strcmp("objc", next_argument)){ typename = next_argument; type = "TEXT"; separator_function = wobjc_separator_function; header_function = wobjc_header_function; finish_header_function = wobjc_finish_header_function; } #endif /* def NeXT */ else if(0 == strcmp("mail", next_argument)){ typename = next_argument; type = "TEXT"; separator_function = mail_separator_function; header_function = mail_header_function; date_function = mail_date_function; finish_header_function = mail_finish_header_function; } else if(0 == strcmp("erg_mail_thread", next_argument)){ typename = next_argument; type = "TEXT"; separator_function = erg_thread_separator_function; header_function = erg_thread_header_function; date_function = mail_date_function; finish_header_function = erg_thread_finish_header_function; } else if(0 == strcmp("mail_or_rmail", next_argument)){ typename = next_argument; type = "TEXT"; separator_function = mail_or_rmail_separator; header_function = mail_header_function; date_function = mail_date_function; finish_header_function = mail_finish_header_function; } else if(0 == strcmp("mail_digest", next_argument)){ typename = next_argument; type = "TEXT"; separator_function = mail_digest_separator_function; header_function = mail_header_function; date_function = mail_date_function; finish_header_function = mail_finish_header_function; } else if(0 == strcmp("mh_bboard", next_argument)){ typename = next_argument; type = "TEXT"; separator_function = mh_bboard_separator_function; header_function = mail_header_function; date_function = mail_date_function; finish_header_function = mail_finish_header_function; } else if(0 == strcmp("rmail", next_argument)){ typename = next_argument; type = "TEXT"; separator_function = rmail_separator_function; header_function = mail_header_function; date_function = mail_date_function; finish_header_function = mail_finish_header_function; } else if(0 == strcmp("netnews", next_argument)){ typename = next_argument; type = "TEXT"; separator_function = NULL; header_function = mail_header_function; date_function = mail_date_function; finish_header_function = mail_finish_header_function; } else if(0 == strcmp("rn", next_argument)){ typename = next_argument; type = "TEXT"; separator_function = rn_separator_function; header_function = mail_header_function; date_function = mail_date_function; finish_header_function = mail_finish_header_function; } else if(0 == strcmp("emacsinfo", next_argument)){ typename = next_argument; type = "TEXT"; separator_function = emacs_info_separator_function; header_function = emacs_info_header_function; finish_header_function = emacs_info_finish_header_function; } else if(0 == strcmp("catalog", next_argument)){ typename = next_argument; type = "TEXT"; separator_function = catalog_separator_function; header_function = catalog_header_function; finish_header_function = catalog_finish_header_function; } else if(0 == strcmp("bio", next_argument)){ typename = next_argument; type = "TEXT"; separator_function = bio_separator_function; header_function = bio_header_function; finish_header_function = bio_finish_header_function; } else if(0 == strcmp("cmapp", next_argument)){ typename = next_argument; type = "TEXT"; separator_function = cmapp_separator_function; header_function = cmapp_header_function; finish_header_function = cmapp_finish_header_function; } else if(0 == strcmp("ftp", next_argument)){ type = "TEXT-FTP"; typename = next_argument; separator_function = first_line_separator_function; header_function = first_line_header_function; finish_header_function = first_line_finish_header_function; } else if(0 == strcmp("jargon", next_argument)){ typename = next_argument; type = "TEXT"; separator_function = jargon_separator_function; header_function = jargon_header_function; finish_header_function = jargon_finish_header_function; } else if(0 == strcmp("server", next_argument)){ typename = next_argument; type = "WSRC"; finish_header_function = filename_finish_header_function; } else if(0 == strcmp("text", next_argument)){ type = "TEXT"; typename = next_argument; check_for_text_file = true; } else if(0 == strcmp("filename", next_argument)){ type = "TEXT"; typename = next_argument; finish_header_function = filename_finish_header_function; } else if(0 == strcmp("irg", next_argument)){ typename = next_argument; type = "TEXT"; separator_function = irg_separator_function; header_function = irg_header_function; finish_header_function = irg_finish_header_function; } /* dash-separated items , Intro to Algorithms buglist, etc */ else if(0 == strcmp("dash", next_argument)){ type = "TEXT"; typename = next_argument; separator_function = dash_separator_function; header_function = dash_header_function; finish_header_function = dash_finish_header_function; } /* one_line-separated items */ else if(0 == strcmp("one_line", next_argument)){ type = "TEXT"; typename = next_argument; separator_function = one_line_separator_function; header_function = one_line_header_function; finish_header_function = one_line_finish_header_function; } /* blank line-separated items (paragraphs) */ else if(0 == strcmp("para", next_argument)){ type = "TEXT"; typename = next_argument; separator_function = para_separator_function; header_function = para_header_function; finish_header_function = para_finish_header_function; } /* seeker items */ else if(0 == strcmp("seeker", next_argument)){ type = "TEXT"; typename = next_argument; separator_function = seeker_separator_function; header_function = seeker_header_function; finish_header_function = seeker_finish_header_function; } /* medline format */ else if(0 == strcmp("medline", next_argument)){ type = "TEXT"; typename = next_argument; separator_function = medline_separator_function; header_function = medline_header_function; finish_header_function = medline_finish_header_function; } /* refer format */ else if(0 == strcmp("refer", next_argument)){ type = "TEXT"; typename = next_argument; separator_function = refer_separator_function; header_function = refer_header_function; finish_header_function = refer_finish_header_function; } /* first_line format */ else if(0 == strcmp("first_line", next_argument)){ type = "TEXT"; typename = next_argument; separator_function = first_line_separator_function; header_function = first_line_header_function; finish_header_function = first_line_finish_header_function; } /* rlin items */ else if(0 == strcmp("rlin", next_argument)){ type = "TEXT"; typename = next_argument; separator_function = rlin_separator_function; header_function = rlin_header_function; finish_header_function = rlin_finish_header_function; } else if(0 == strcmp("dvi", next_argument)){ typename = next_argument; type = "DVI"; finish_header_function = filename_finish_header_function; } else if(0 == strcmp("ps", next_argument)){ typename = next_argument; type = "PS"; finish_header_function = filename_finish_header_function; } else if(0 == strcmp("pict", next_argument)){ typename = next_argument; type = "PICT"; finish_header_function = filename_finish_header_function; index_contents = false; } else if(0 == strcmp("gif", next_argument)){ typename = next_argument; type = "GIF"; finish_header_function = filename_finish_header_function; index_contents = false; } else if(0 == strcmp("tiff", next_argument)){ typename = next_argument; type = "TIFF"; finish_header_function = filename_finish_header_function; index_contents = false; } /* BibTeX items */ else if(0 == strcmp("bibtex", next_argument)){ type = "TEXT"; typename = next_argument; separator_function = bibtex_separator_function; header_function = bibtex_header_function; finish_header_function = bibtex_finish_header_function; } /* ?:? seperated hypertext items */ else if(0 == strcmp("nhyp", next_argument)){ type = "TEXT"; typename = next_argument; separator_function = nhyp_separator_function; header_function = nhyp_header_function; finish_header_function = nhyp_finish_header_function; } else if(0 == strcmp("ziff", next_argument)){ type = "TEXT"; typename = next_argument; separator_function = ziff_separator_function; header_function = ziff_header_function; finish_header_function = ziff_finish_header_function; } else{ panic("Don't recognize the '%s' type", next_argument); } #endif /* 0 */ } else{ panic("Don't recognize the '%s' option", next_argument); } next_argument = next_arg(&argc, &argv); if (! (read_files_from_stdin || next_argument)) { fprintf(stderr,"No files specified\n"); exit(0); } } start_of_filenames = argc_copy - argc - 1; /* check index */ if(0 == strlen(pathname_name(index_filename))){ waislog(WLOG_HIGH, WLOG_ERROR, "The pathname specified for the destination of the index files ('%s') should have a leaf filename without an extention rather than just a directory.", index_filename); exit(0); } waislog(WLOG_MEDIUM, WLOG_INDEX, "Starting to build database %s", index_filename); if(0 != init_search_engine(index_filename, false, false, cm_mem_percent, text_size, grow_percent)) panic("unable to initialize search engine"); if(true == adding_to_existing_index){ db = openDatabase(index_filename, false, false); if (db == NULL){ /* does not exist, create one */ db = openDatabase(index_filename, true, false); if (db == NULL) panic("unable to open the database"); } } else{ db = openDatabase(index_filename, true, false); if (db == NULL) panic("unable to open the database"); } { /* set up the memory hashtable */ if(memory_to_use < 0){ /* default */ /* do nothing */ } else if(memory_to_use <= 2){ hashtable_size = 1L<<16; flush_after_n_words = 50000; } else if(memory_to_use <= 5){ hashtable_size = 1L<<16; flush_after_n_words = 150000; } else if(memory_to_use <= 10){ /* shown to take about 6MB on a sun4, when it is dict limited */ hashtable_size = 1L<<16; flush_after_n_words = 300000; } else if(memory_to_use <= 20){ hashtable_size = 1L<<17; flush_after_n_words = 600000; } else{ /* over 20 Mbytes */ hashtable_size = 1L<<18; flush_after_n_words = 1000000; } init_add_word(db, hashtable_size, flush_after_n_words); } if (read_files_from_stdin) { if (0 != (next_argument = fgets(data_filename, MAX_PATH_NAME_LEN, stdin))) { int len = strlen(next_argument); if (next_argument[len-1] == '\n') { next_argument[len-1] = '\0'; } } } while(NULL != next_argument){ /* the first filename is in next_argument already */ if(directoryp(next_argument)){ if(traverse_directory){ index_directory(next_argument, separator_function, header_function, date_function, finish_header_function, type, db, check_for_text_file, adding_to_existing_index, word_positions, word_pairs); } } else{ /* not a directory */ waislog(WLOG_MEDIUM, WLOG_INDEX, "Indexing file: %s", next_argument); index_text_file(next_argument, separator_function, header_function, date_function, finish_header_function, type, db, check_for_text_file, adding_to_existing_index, word_positions, word_pairs); } if (read_files_from_stdin) { if (0 != (next_argument = fgets(data_filename, MAX_PATH_NAME_LEN, stdin))) { int len = strlen(next_argument); if (next_argument[len-1] == '\n') { next_argument[len-1] = '\0'; } } } else { next_argument = next_arg(&argc, &argv); } } finished_add_word(db); { char filename[MAX_FILENAME_LEN + 1]; if(!probe_file(source_filename(filename, db))){ char database_name[MAX_FILENAME_LEN]; write_src_structure(source_filename(filename, db), export_database?pathname_name(index_filename): truename(index_filename, database_name), typename, &argv_copy[start_of_filenames], argc_copy - start_of_filenames, export_database, 210L); } /* write out a description of the server if appropriate */ if(register_database){ register_src_structure(source_filename(filename, db)); } } if(make_catalog) build_catalog(db); closeDatabase(db); waislog(WLOG_MEDIUM, WLOG_INDEX, "Finished build"); exit(0); }